#
#	This file contains R functions to implement the Polyfit add-on to DESeq and edgeR
#	described in the paper "Improved error estimates for the analysis of differential 
#	expression from RNA-seq data"
#
#	pfNbinomTest()
#		replaces the DESeq function nbinomTest(); produces a p-value distribution 
#		without the 'flagpole' at p=1
#
#	pfNbinomTestForMatrices()
#		replaces the DESeq function nbinomTestForMatrices(); is called by pfNbinomTest()
#
#	pfExactTest()
#		replaces the edgeR function exactTest(); produces a p-value distribution 
#		without the 'flagpole' at p=1
#
#	pfExactTestDoubleTail()
#		replaces the edgeR function exactTestDoubleTail(); is called by pfExactTest()
#
#	twoSidedPValueFromDiscrete()
#		function to produce a 2-sided p-value with a uniform distribution on [0, 1] 
#		from a discrete distribution; is called by pfNbinomTestForMatrices() and 
#		by pfExactTestDoubleTail()
#
#	levelPValues()
#		Function to level out a p-value spectrum generated by DESeq or edgeR by fitting a 
#		quadratic function to the right hand portion of the spectrum, and to produce 
#		'corrected' p-values and q-values using an adapted version of the 
#		Storey-Tibshirani procedure
	 	
#########################################################################################
#
#	The function replacing the original function nbinomTest()
#
#	Conrad Burden, April 2013
#	(Adapted from original R code by Simon Anders)
#
pfNbinomTest <- function (cds, condA, condB, pvals_only = FALSE, eps = NULL) 
{
    # Two-condition negative binomial test that obtains its p-values from
    # pfNbinomTestForMatrices() rather than nbinomTestForMatrices().
    #
    # Arguments:
    #   cds          a CountDataSet on which dispersions have been estimated
    #   condA, condB the two condition levels to compare
    #   pvals_only   if TRUE, return only the vector of p-values
    #   eps          defunct; a warning is issued if it is supplied
    #
    # Returns the p-value vector, or a data frame with one row per gene holding
    # id, base means, fold changes, pval and the BH-adjusted padj.
    #
    # getBaseMeansAndVariances(), dispTable(), fitInfo(), conditions(),
    # counts(), sizeFactors() and fData() are not defined in this file
    # (presumably supplied by DESeq and its dependencies).
    stopifnot(is(cds, "CountDataSet"))
    if (cds@multivariateConditions) {
        stop("For CountDataSets with multivariate conditions, only the GLM-based test can be used.")
    }
    dispersionTable <- dispTable(cds)
    if (all(is.na(dispersionTable))) {
        stop("Call 'estimateDispersions' first.")
    }
    if (dispersionTable[condA] == "blind" || dispersionTable[condB] == "blind") {
        if (fitInfo(cds, "blind")$sharingMode != "fit-only") {
            warning("You have used 'method=\"blind\"' in estimateDispersion without also setting 'sharingMode=\"fit-only\"'. This will not yield useful results.")
        }
    }
    stopifnot(condA %in% levels(conditions(cds)))
    stopifnot(condB %in% levels(conditions(cds)))
    if (!is.null(eps)) {
        warning("The 'eps' argument is defunct and hence ignored.")
    }

    # Column masks selecting the samples of each condition
    sampleConditions <- conditions(cds)
    inA <- sampleConditions == condA
    inB <- sampleConditions == condB
    inEither <- inA | inB

    countMatrix <- counts(cds)
    sizeFacs <- sizeFactors(cds)
    pooled <- getBaseMeansAndVariances(countMatrix[, inEither], sizeFacs[inEither])

    # Per-gene dispersion columns are named "disp_<entry of the dispersion table>"
    featureTable <- fData(cds)
    rawScvA <- featureTable[, paste0("disp_", dispersionTable[condA])]
    rawScvB <- featureTable[, paste0("disp_", dispersionTable[condB])]

    # Replacement for the original call to nbinomTestForMatrices()
    pval <- pfNbinomTestForMatrices(countMatrix[, inA], countMatrix[, inB], 
        sizeFacs[inA], sizeFacs[inB], rawScvA, rawScvB)
    if (pvals_only) {
        return(pval)
    }

    meansA <- getBaseMeansAndVariances(countMatrix[, inA], sizeFacs[inA])$baseMean
    meansB <- getBaseMeansAndVariances(countMatrix[, inB], sizeFacs[inB])$baseMean
    ratioBA <- meansB/meansA
    data.frame(id = rownames(countMatrix), baseMean = pooled$baseMean, 
        baseMeanA = meansA, baseMeanB = meansB, 
        foldChange = ratioBA, log2FoldChange = log2(ratioBA), 
        pval = pval, padj = p.adjust(pval, method = "BH"), 
        stringsAsFactors = FALSE)
}
#
#########################################################################################
#
#	The function replacing the original function nbinomTestForMatrices()
#
#	Conrad Burden, April 2013
#	(Adapted from original R code by Simon Anders)
#
pfNbinomTestForMatrices <- function (countsA, countsB, sizeFactorsA, sizeFactorsB, 
    dispsA, dispsB) 
{
    # Per-gene two-sided p-values for the difference between two groups of
    # count columns, without the 'flagpole' at p = 1.
    #
    # Arguments:
    #   countsA, countsB            count matrices (genes in rows), one per condition
    #   sizeFactorsA, sizeFactorsB  size factors of the columns of countsA / countsB
    #   dispsA, dispsB              per-gene dispersions for each condition
    #
    # Returns a numeric vector with one p-value per gene; genes whose counts
    # are all zero in both conditions get NA. A zero-row input gives numeric(0).
    #
    # The original DESeq code returned min(1, 2 * one-sided tail sum); here the
    # conditional distribution of the condition-A total, given the overall
    # total, is handed to twoSidedPValueFromDiscrete() instead. That function
    # draws a random number, so results vary between calls unless the RNG
    # seed is fixed.
    kAs <- rowSums(cbind(countsA))
    kBs <- rowSums(cbind(countsB))

    # No genes: return an empty result rather than iterating over 1:0
    if (length(kAs) == 0L) 
        return(numeric(0))

    sumSfA <- sum(sizeFactorsA)
    sumSfB <- sum(sizeFactorsB)

    # Pooled per-gene mean of the size-factor-normalised counts
    mus <- rowMeans(cbind(t(t(countsA)/sizeFactorsA), t(t(countsB)/sizeFactorsB)))

    # Variance of the summed counts, floored just above the Poisson variance
    # so that the derived dispersions stay strictly positive
    fullVarsA <- pmax(mus * sumSfA + dispsA * mus^2 * sum(sizeFactorsA^2), 
        mus * sumSfA * (1 + 1e-08))
    fullVarsB <- pmax(mus * sumSfB + dispsB * mus^2 * sum(sizeFactorsB^2), 
        mus * sumSfB * (1 + 1e-08))
    sumDispsA <- (fullVarsA - mus * sumSfA)/(mus * sumSfA)^2
    sumDispsB <- (fullVarsB - mus * sumSfB)/(mus * sumSfB)^2

    vapply(seq_along(kAs), function(i) {
        if (kAs[i] == 0 && kBs[i] == 0) 
            return(NA_real_)
        kTotal <- kAs[i] + kBs[i]
        ks <- 0:kTotal
        # Unnormalised probability of each split (ks, kTotal - ks) of the total
        ps <- dnbinom(ks, mu = mus[i] * sumSfA, size = 1/sumDispsA[i]) * 
            dnbinom(kTotal - ks, mu = mus[i] * sumSfB, size = 1/sumDispsB[i])
        probs <- ps/sum(ps)
        twoSidedPValueFromDiscrete(probs, kAs[i])
    }, numeric(1))
}
#
#########################################################################################
#
#	The function replacing the original function exactTest()
#
#	Conrad Burden, April 2013
#	Adapted from original R code by Mark Robinson, Davis McCarthy and Gordon Smyth
#
pfExactTest <- function (object, pair = 1:2, dispersion = "auto", rejection.region = "doubletail", 
    big.count = 900, prior.count.total = 0.5) 
{
    # Exact test for a difference between two groups of libraries, using
    # pfExactTestDoubleTail() for the "doubletail" rejection region.
    #
    # Arguments:
    #   object             a DGEList
    #   pair               the two groups to compare, as indices into the group
    #                      levels or as group names
    #   dispersion         numeric dispersion(s) of length 1 or number of tags, or
    #                      one of "auto", "common", "trended", "tagwise"
    #   rejection.region   "doubletail", "deviance" or "smallp"
    #   big.count          passed through to the p-value routines
    #   prior.count.total  total prior count, shared between the libraries in
    #                      proportion to their effective library sizes
    #
    # Returns a DGEExact object built from a table of logFC, logCPM and PValue,
    # the compared pair and object$genes.
    #
    # getDispersion(), mglmOneGroup(), q2qnbinom(), exactTestByDeviance() and
    # exactTestBySmallP() are not defined in this file (presumably edgeR).
    if (!is(object, "DGEList")) 
        stop("Currently only supports DGEList objects as the object argument.")
    if (length(pair) != 2) 
        stop("Pair must be of length 2.")
    rejection.region <- match.arg(rejection.region, c("doubletail", 
        "deviance", "smallp"))
    group <- as.factor(object$samples$group)
    levs.group <- levels(group)
    if (is.numeric(pair)) 
        pair <- levs.group[pair]
    else pair <- as.character(pair)
    if (!all(pair %in% levs.group)) 
        stop("At least one element of given pair is not a group.\n Groups are: ", 
            paste(levs.group, collapse = " "), "\n")

    # Resolve the dispersion argument to a numeric vector with one entry per tag
    if (is.null(dispersion)) 
        dispersion <- "auto"
    if (is.character(dispersion)) {
        dispersion <- match.arg(dispersion, c("auto", "common", 
            "trended", "tagwise"))
        dispersion <- switch(dispersion, common = object$common.dispersion, 
            trended = object$trended.dispersion, tagwise = object$tagwise.dispersion, 
            auto = getDispersion(object))
        if (is.null(dispersion)) 
            stop("specified dispersion not found in object")
    }
    ldisp <- length(dispersion)
    ntags <- nrow(object$counts)
    if (ldisp != 1 && ldisp != ntags) 
        stop("Dispersion provided by user must have length either 1 or the number of tags in the DGEList object.")
    if (ldisp == 1) 
        dispersion <- rep(dispersion, ntags)

    # Restrict counts and library information to the two groups being compared
    group <- as.character(group)
    j <- group %in% pair
    y <- object$counts[, j, drop = FALSE]
    lib.size <- object$samples$lib.size[j]
    norm.factors <- object$samples$norm.factors[j]
    group <- group[j]
    if (is.null(rownames(y))) 
        rownames(y) <- paste("tag", seq_len(ntags), sep = ".")
    lib.size <- lib.size * norm.factors
    offset <- log(lib.size)
    lib.size.average <- exp(mean(offset))
    abundance <- mglmOneGroup(y, dispersion = dispersion, offset = offset)
    logCPM <- (abundance + log(1e+06))/log(2)
    prior.count <- lib.size
    prior.count <- prior.count.total * prior.count/sum(prior.count)

    # Group-wise abundances (with prior counts added), used for logFC only
    j1 <- group == pair[1]
    n1 <- sum(j1)
    if (n1 == 0) 
        stop("No libraries for ", pair[1])
    y1 <- y[, j1, drop = FALSE]
    abundance1 <- mglmOneGroup(y1 + matrix(prior.count[j1], ntags, 
        n1, byrow = TRUE), offset = offset[j1])
    j2 <- group == pair[2]
    n2 <- sum(j2)
    # The second group must be checked with n2; testing n1 again (as the code
    # previously did) can never trigger here.
    if (n2 == 0) 
        stop("No libraries for ", pair[2])
    y2 <- y[, j2, drop = FALSE]
    abundance2 <- mglmOneGroup(y2 + matrix(prior.count[j2], ntags, 
        n2, byrow = TRUE), offset = offset[j2])
    logFC <- (abundance2 - abundance1)/log(2)

    # Map the counts of each library onto a common library size
    # (lib.size.average) via q2qnbinom() before testing
    e <- exp(abundance)
    input.mean <- matrix(e, ntags, n1)
    output.mean <- input.mean * lib.size.average
    input.mean <- t(t(input.mean) * lib.size[j1])
    y1 <- q2qnbinom(y1, input.mean = input.mean, output.mean = output.mean, 
        dispersion = dispersion)
    input.mean <- matrix(e, ntags, n2)
    output.mean <- input.mean * lib.size.average
    input.mean <- t(t(input.mean) * lib.size[j2])
    y2 <- q2qnbinom(y2, input.mean = input.mean, output.mean = output.mean, 
        dispersion = dispersion)

    # Replacement code: "doubletail" calls pfExactTestDoubleTail() instead of
    # the original exactTestDoubleTail(); the other regions are unchanged.
    exact.pvals <- switch(rejection.region, doubletail = pfExactTestDoubleTail(y1, 
        y2, dispersion = dispersion, big.count = big.count), 
        deviance = exactTestByDeviance(y1, y2, dispersion = dispersion, 
            big.count = big.count), smallp = exactTestBySmallP(y1, 
            y2, dispersion = dispersion, big.count = big.count))

    de.out <- data.frame(logFC = logFC, logCPM = logCPM, PValue = exact.pvals)
    rn <- rownames(object$counts)
    if (!is.null(rn)) 
        rownames(de.out) <- make.unique(rn)
    new("DGEExact", list(table = de.out, comparison = pair, genes = object$genes))
}
#
#########################################################################################
#
#	The function replacing the original function exactTestDoubleTail()
#
#	Conrad Burden, April 2013
#	Adapted from original R code by Mark Robinson, Davis McCarthy and Gordon Smyth
#
pfExactTestDoubleTail <- function (y1, y2, dispersion = 0, big.count = 900) 
{
    # Two-sided exact-test p-values per tag, without the 'flagpole' at p = 1.
    #
    # Arguments:
    #   y1, y2      counts for the two groups (tags in rows, libraries in columns)
    #   dispersion  a single dispersion or one per tag
    #   big.count   tags whose totals exceed this in BOTH groups are handled by
    #               exactTestBetaApprox()
    #
    # Tags with dispersion <= 0 are handled by binomTest(). For every other
    # tag the conditional distribution of the group-1 total given the overall
    # total is passed to twoSidedPValueFromDiscrete(), which uses a random
    # number, so those p-values differ from call to call.
    #
    # binomTest() and exactTestBetaApprox() are not defined in this file
    # (presumably edgeR).
    nTags <- NROW(y1)
    nLib1 <- NCOL(y1)
    nLib2 <- NCOL(y2)

    # Rounded per-tag totals within each group
    tot1 <- if (nLib1 > 1) round(rowSums(y1)) else round(y1)
    tot2 <- if (nLib2 > 1) round(rowSums(y2)) else round(y2)
    if (length(dispersion) == 1) {
        dispersion <- rep(dispersion, nTags)
    }
    totBoth <- tot1 + tot2
    perLibMean <- totBoth/(nLib1 + nLib2)
    expect1 <- nLib1 * perLibMean
    expect2 <- nLib2 * perLibMean

    pvals <- rep(1, nTags)
    names(pvals) <- names(y1)

    # Poisson case (no overdispersion)
    isPois <- dispersion <= 0
    if (any(isPois)) {
        pvals[isPois] <- binomTest(pmax(tot1[isPois], 0), pmax(tot2[isPois], 0), 
            p = nLib1/(nLib1 + nLib2))
    }

    # Large counts in both groups
    isBig <- tot1 > big.count & tot2 > big.count
    if (any(isBig)) {
        y1 <- as.matrix(y1)
        y2 <- as.matrix(y2)
        pvals[isBig] <- exactTestBetaApprox(y1[isBig, , drop = FALSE], 
            y2[isBig, , drop = FALSE], dispersion[isBig])
    }

    # Replacement code for 2-sided p-value without 'flagpole': every tag not
    # yet handled is treated with the full conditional distribution.
    normaliser <- nbSize1 <- nbSize2 <- rep(0, nTags)
    todo <- !(isPois | isBig)
    if (any(todo)) {
        # Probability of the observed overall total; normalises the products below
        normaliser[todo] <- dnbinom(totBoth[todo], size = (nLib1 + nLib2)/dispersion[todo], 
            mu = totBoth[todo])
        nbSize1[todo] <- nLib1/dispersion[todo]
        nbSize2[todo] <- nLib2/dispersion[todo]
        for (tag in which(todo)) {
            support <- 0:totBoth[tag]
            condDistr <- dnbinom(support, size = nbSize1[tag], mu = expect1[tag]) * 
                dnbinom(totBoth[tag] - support, size = nbSize2[tag], mu = expect2[tag]) / 
                normaliser[tag]
            pvals[tag] <- twoSidedPValueFromDiscrete(condDistr, tot1[tag])
        }
    }

    pmin(pvals, 1)
}
#
#########################################################################################
#
#	Function to calculate a 2-sided p-value of an observation xobs for a finite 
#	discrete distribution
#		Prob(X = xobs) = probs[xobs + 1]
#	over the range xobs in (0, 1, ..., xmax) by "squaring off" the distribution to a continuous 
#   distribution
#
#	Arguments: 
#	probs	an array containing the probabilities that X takes the values 0, 1, ...
#	xobs	the observed value of X  
#
#	Note that the returned 2-sided p-value contains a random component, i.e. a given 
#	set of input parameters returns a different result each time 
#
#	April 2013
#	Conrad Burden
#
	twoSidedPValueFromDiscrete <- function(probs, xobs) {
		#	Randomised 2-sided p-value of an observation xobs from the finite
		#	discrete distribution Prob(X = k) = probs[k + 1], k = 0, ..., xmax.
		#
		#	probs	non-negative weights for X = 0, 1, ...; normalised (with a
		#			warning) if they do not sum to 1
		#	xobs	a single observed value in 0, ..., length(probs) - 1
		#
		#	Returns a single number in [0, 1]. Exactly one uniform random
		#	number is drawn per call, so repeated calls give different results
		#	unless the RNG seed is fixed.
		#
		#	Stops if probs is empty, contains missing or negative values, or
		#	has no positive finite sum, or if xobs is not a single value in range.
		if (length(probs) == 0) {
			# Without this guard xmax would be -1 and 0:xmax would be c(0, -1),
			# letting xobs = 0 or -1 slip through the range check below
			stop("probs must contain at least one probability")
		}
		if (anyNA(probs)) {
			stop("probs contains missing values")
		}
		if (!all(probs >= 0)) {
			stop("probs contains negative values \n")
		}
		total <- sum(probs)
		if (!is.finite(total) || total <= 0) {
			stop("probs must have a positive, finite sum")
		}
		if (abs(total - 1) > 1.e-10) {
			warning("probs do not sum to 1 and will be normalised \n")
		}
		probs <- probs/total
		xmax <- length(probs) - 1
		if (length(xobs) != 1) {
			stop("xobs not a single value in the range of probs")
		}
		if (!is.element(xobs, 0:xmax)) {
			stop("xobs not a single value in the range of probs")
		}
		#
		#	The bar at xobs is "squared off" to a uniform density on
		#	[xobs, xobs + 1); a cut point is chosen uniformly within that bar
		#
		randomFrac <- runif(1)
		#
		#	Probability mass to the left of the cut point
		#
		distribFunct <- c(0, cumsum(probs))
		leftSidePValue <- distribFunct[xobs + 1] + randomFrac * probs[xobs + 1]
		#
		#	Twice the smaller tail; floored at 0 because round-off in the
		#	cumulative sum can push leftSidePValue marginally above 1
		#
		pVal <- max(0, 2 * min(leftSidePValue, 1 - leftSidePValue))
		return(pVal)
		}
#
#########################################################################################
#
#	Function to level out a p-value spectrum generated by DESeq or edgeR by fitting a 
#		quadratic function to the right hand portion of the spectrum, and to
#		produce 'corrected' p-values and q-values using an adapted version of the 
#		Storey-Tibshirani procedure
#
#	Arguments:
#	oldPvalues	an array of p-values produced by the replacement DESeq function 
#				pfNbinomTest() or the replacement edgeR function pfExactTest()
#	plot 		TRUE to plot original and corrected pvalue spectra; FALSE not to plot
#
#	Returns a list containing:
#		$pi0estimate	an estimate of the proportion of genes not differentially expressed
#		$lambdaOptimal	the point in the p-value spectrum past which a quadratic is fitted
#		$pValueCorr		p-values calculated from the levelled spectrum 
#		$qValueCorr		q-values calculated from the levelled spectrum 
#		$qValueCorrBH	q-values calculated from $pValueCorr using Benjamini-Hochberg 
#
#	April 2013
#	Conrad Burden
#
	levelPValues <- function(oldPvals, plot=TRUE){
#
#	Levels a p-value spectrum by fitting a quadratic to the part of its
#	histogram to the right of a cut-off lambda, for a grid of lambdas, and
#	picks the fit whose pi0 estimate is closest to the peak of the density
#	of all pi0 estimates.
#
#	oldPvals	vector of p-values in [0, 1]; NAs are allowed
#	plot		TRUE to draw a 2 x 2 panel of diagnostic plots on a new
#				graphics device; FALSE to draw nothing
#
#	Returns a list with components pi0estimate, lambdaOptimal, pValueCorr,
#	qValueCorr and qValueCorrBH.
#
		nGenes <- length(oldPvals) 
		originalHist <- hist(oldPvals, breaks=seq(0,1,by=0.01), plot=FALSE)
		x <- originalHist$mids		# x and y are the coordinates of the midpoints of 
		y <- originalHist$counts	#  the tops of the bars of the histogram
#
#	Define the quadratic fitting function and its integral from 0 to x:
# 
		quadFunct <- function(x, a){
			a[1] + a[2]*x + a[3]*x^2 
			}
#
		intQuadFunct <- function(x, a){
			a[1]*x + a[2]*x^2/2 + a[3]*x^3/3
			}
#
#	Set up some arrays for loop over lambda
# 
		lambdaArray <- seq(0.1, 0.95, by=0.01)
		aEstimate <- array(dim=c(3,length(lambdaArray)))
		pi0hat <- array(dim=length(lambdaArray))
#
#	nlm() needs an initial guess for the parameters:
#		use a straight line fit for the first 2 parameters first time through
#		then use aEstimate from previous lambda
#
		astart <- c(unname(coef(lm(y~x))), 0)
#
		for(lambdaIndex in seq_along(lambdaArray)){
			lambda <- lambdaArray[lambdaIndex]
			inFitRange <- x>=lambda
			xLambda <- x[inFitRange]
			yLambda <- y[inFitRange]
#	
#	Define a function equal to the sum of squares residuals, which gets minimised
#
			sumSq <- function(a){
				sum((yLambda - quadFunct(xLambda, a))^2)
				}
#
#	Minimise the sum of squares residuals, 
#	fitted parameters are stored in the variable aEstimate
#
			fit <- nlm(sumSq, astart)
			aEstimate[, lambdaIndex] <- fit$estimate
			astart <- aEstimate[, lambdaIndex]
#
#	piZero(lambda) estimate: (area under the fitted quadratic from 0 to 1)/(area under histogram)
#
			pi0hat[lambdaIndex] <- intQuadFunct(1, astart)/sum(y)/0.01
			}
#
#	Calculate a density plot using only the 'reasonable' pi0hat's
#
		reasonableIndex <- which(pi0hat>=0 & pi0hat<2)
		if (length(reasonableIndex) < 2){
			stop("fewer than two pi0 estimates lie in [0, 2); cannot locate the density peak")
			}
		pi0hatR <- pi0hat[reasonableIndex]
#
#	Optimal lambda and pi_0 got by seeing where density of pi0 peaks
#
		ddd <- density(pi0hatR, na.rm=TRUE)
		pi0estimate <- ddd$x[which.max(ddd$y)]
#
#	The closest 'reasonable' estimate is located within pi0hatR and mapped
#	back to its position on the full lambda grid, so that lambdaOptimal and
#	the quadratic coefficients always refer to the same fit
#
		optimalIndex <- reasonableIndex[which.min(abs(pi0hatR-pi0estimate))]
		lambdaOptimal <- lambdaArray[optimalIndex]
		aEst <- aEstimate[,optimalIndex]
#
#	Calculate 'corrected' p-values: normalised integral of the fitted quadratic
#
		pValueCorr <- intQuadFunct(oldPvals, aEst)/intQuadFunct(1, aEst)
#
#	Calculate 'corrected' q-values
#
		nGenes1 <- sum(!is.na(oldPvals))
		TPplusFP <- array(dim=nGenes)
		TPplusFP[order(pValueCorr)] <- seq_len(nGenes)   # any NAs present are ordered last
		FP <- pValueCorr*pi0estimate*nGenes1
		qValueCorr <- FP/TPplusFP
		qValueCorrBH <- p.adjust(pValueCorr, method="BH")
#
#	Plots
#
		if (plot){
#
#	dev.new() opens the platform's default device; quartz() exists only on
#	macOS, so calling it on every unix-alike failed elsewhere
#
			dev.new(height=8, width=8)
			oldpar <- par(mfrow=c(2,2))
			on.exit(par(oldpar), add=TRUE)
#
#	Plot pi0 estimates at each lambda
#	
			plot(lambdaArray,pi0hat, ylim=c(0.6,1.2), pch=16, cex=0.9, xlim=c(0, 1),
				xlab=expression(lambda), ylab = expression(hat(pi)[0](lambda)), 
				main = "")
			mtext(substitute(hat(pi)[0]*" = "*this*"  at  "*lambda*" = "*that, 
							list(this=round(pi0estimate, digits = 3), 
								 that=round(lambdaOptimal, digits = 2))), 
		  					side = 3, font=10)
			abline(h=pi0estimate, lty=2, cex=0.5)
			points(lambdaOptimal, pi0estimate,col="red", pch=16, cex=1.25)	
#
#	Plot histogram of original p-values and superimpose quadratic
#
			plot(originalHist, 
					xlab = "p-value", main="Original P-value spectrum")
			xPointsLeft <- seq(0,lambdaOptimal,by=0.01)
			xPointsRight <- seq(lambdaOptimal,1,by=0.01)
			points(xPointsLeft, quadFunct(xPointsLeft,aEst), type = "l", col= "green",lwd=1.25)
			points(xPointsRight, quadFunct(xPointsRight,aEst), type = "l", col= "red",lwd=1.25)
#
#	Plot density of pi0 estimates at each lambda
#	
			plot(ddd, xlab = expression(hat(pi)[0](lambda)), main = "")
#
#	Plot histogram of corrected p-values and straight line at pi0hat
#
			hist(pValueCorr, breaks=seq(0,1,by=0.01), xlab = "corrected p-value", 
					main="Corrected P-value spectrum")
#
			height <- nGenes1*pi0estimate/100
			lambdaCorr <- intQuadFunct(lambdaOptimal, aEst)/intQuadFunct(1, aEst)
			points(c(0,lambdaCorr),c(height,height), col="green", type="l",lwd=1.25)
			points(c(lambdaCorr,1),c(height,height), col="red", type="l",lwd=1.25)			
			}
#
		list(pi0estimate=pi0estimate, lambdaOptimal=lambdaOptimal, 
			pValueCorr=pValueCorr, qValueCorr=qValueCorr, qValueCorrBH=qValueCorrBH)
		}
#